import pandas as pd
import plotly.express as px
import plotly.io as pio
# Render Plotly figures inline in the notebook.
pio.renderers.default = 'notebook'

# Load the house-sales dataset and list its column names.
df = pd.read_csv(filepath_or_buffer=".././data/house_sales.csv")
df.columns
Index(['price', 'num_bed', 'num_bath', 'size_house', 'size_lot', 'num_floors',
'is_waterfront', 'condition', 'size_basement', 'year_built',
'renovation_date', 'zip', 'latitude', 'longitude',
'avg_size_neighbor_houses', 'avg_size_neighbor_lot'],
dtype='object')
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()
| | price | num_bed | num_bath | size_house | size_lot | num_floors | is_waterfront | condition | size_basement | year_built | renovation_date | zip | latitude | longitude | avg_size_neighbor_houses | avg_size_neighbor_lot |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.844800e+04 | 18448.000000 | 18448.000000 | 18448.000000 | 1.844800e+04 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 | 18448.000000 |
| mean | 5.423624e+05 | 3.372615 | 2.118888 | 2083.940915 | 1.503602e+04 | 1.494606 | 0.007643 | 3.411698 | 293.571498 | 1971.001138 | 85.145002 | 98077.921455 | 47.560030 | -122.214419 | 1988.306483 | 12571.596216 |
| std | 3.720135e+05 | 0.933892 | 0.772384 | 921.416218 | 4.181455e+04 | 0.540806 | 0.087092 | 0.652593 | 443.607503 | 29.361619 | 403.371263 | 53.497440 | 0.138557 | 0.139910 | 686.173124 | 26329.260211 |
| min | 7.800000e+04 | 0.000000 | 0.000000 | 290.000000 | 5.200000e+02 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1900.000000 | 0.000000 | 98001.000000 | 47.155933 | -122.518648 | 399.000000 | 651.000000 |
| 25% | 3.218375e+05 | 3.000000 | 1.750000 | 1430.000000 | 5.050000e+03 | 1.000000 | 0.000000 | 3.000000 | 0.000000 | 1952.000000 | 0.000000 | 98033.000000 | 47.471527 | -122.328084 | 1490.000000 | 5100.000000 |
| 50% | 4.500000e+05 | 3.000000 | 2.250000 | 1920.000000 | 7.600500e+03 | 1.500000 | 0.000000 | 3.000000 | 0.000000 | 1975.000000 | 0.000000 | 98065.000000 | 47.571599 | -122.230688 | 1840.000000 | 7611.000000 |
| 75% | 6.480000e+05 | 4.000000 | 2.500000 | 2560.000000 | 1.062525e+04 | 2.000000 | 0.000000 | 4.000000 | 570.000000 | 1997.000000 | 0.000000 | 98118.000000 | 47.677918 | -122.125733 | 2370.000000 | 10050.000000 |
| max | 7.700000e+06 | 33.000000 | 8.000000 | 13540.000000 | 1.651359e+06 | 3.500000 | 1.000000 | 5.000000 | 4820.000000 | 2015.000000 | 2015.000000 | 98199.000000 | 47.777624 | -121.315254 | 6110.000000 | 858132.000000 |
The histogram plots are expected to give a general idea of the distribution of each variable and to reveal possible outliers.
# Draw one histogram per column; iterating a DataFrame yields its column labels.
for col_name in df:
    fig = px.histogram(data_frame=df, x=col_name)
    fig.show()
Boxplots give a clearer view of the main summary statistics and the outliers in the data.
# Draw one horizontal boxplot per column; iterating a DataFrame yields its column labels.
for col_name in df:
    fig = px.box(data_frame=df, x=col_name)
    fig.show()
Count the number of nulls and zero values in the fields.
# Count missing values per column: isnull() yields a boolean frame and
# sum() adds up the True cells. (Masking with df[df.isnull()] and then
# calling count() always gives 0, because count() ignores the very NaNs
# that the mask keeps.)
df.isnull().sum()
price 0 num_bed 0 num_bath 0 size_house 0 size_lot 0 num_floors 0 is_waterfront 0 condition 0 size_basement 0 year_built 0 renovation_date 0 zip 0 latitude 0 longitude 0 avg_size_neighbor_houses 0 avg_size_neighbor_lot 0 dtype: int64
# Count, per column, how many cells are exactly zero.
df.eq(0).sum()
price 0 num_bed 12 num_bath 7 size_house 0 size_lot 0 num_floors 0 is_waterfront 18307 condition 0 size_basement 11174 year_built 0 renovation_date 17661 zip 0 latitude 0 longitude 0 avg_size_neighbor_houses 0 avg_size_neighbor_lot 0 dtype: int64
Identify the fields most strongly correlated with price and with each other.
# Pairwise correlation between all columns of the frame.
corr_matrix = df.corr()

# Show the correlation matrix as a 1000x1000 heatmap.
figure = px.imshow(
    corr_matrix,
    width=1000,
    height=1000,
)
figure.show()